Let me tell you the story of how

I lost $98 Million…

ggplot(corn %>% 
         filter(date > '2007-01-01' & date < '2016-01-01'), 
       aes(x = date, y = value)) + 
  geom_line() + ggtitle('Historical Corn Prices - USD ($) Per Bushel')

Working at a fund

ggplot(rain %>%
         filter(date > '2007-01-01'), 
       aes(x = date, y = pr, col=Country)) + geom_line()

c = corn %>% 
  tbl_time(date) %>%
  as_period('monthly', side='start') %>%
  mutate(date = lubridate::floor_date(date,'month'))

r = rain %>%
  select(date, Country, pr) %>%
  spread(Country, pr)

df = c %>%
  left_join(r, by = 'date') %>%
  rename(corn = value) %>%
  drop_na() #%>% gather(nam,val,-date)

p = ggplot(df) 
p + geom_point(aes(x = corn, y = AUS)) 

p + geom_point(aes(x = corn, y = BRA))

p + geom_point(aes(x = corn, y = USA))

plot(df %>% select(-date))

Time series analysis:

What is a stationary time series?

“A time series is stationary if a shift in time doesn’t cause a change in the shape of the distribution. Basic properties of the distribution like the mean, variance, and covariance are constant over time.”

Stationary vs Seasonal Time series - http://www.statisticshowto.com/stationarity/

Stationary vs Seasonal Time series - http://www.statisticshowto.com/stationarity/

Different types of stationarity:

Why do we care about stationarity?

Most statistical models that have been developed assume stationarity.

# Example from: https://www.analyticsvidhya.com/blog/2015/12/complete-tutorial-time-series-modeling/
AirPassengers
##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 1949 112 118 132 129 121 135 148 148 136 119 104 118
## 1950 115 126 141 135 125 149 170 170 158 133 114 140
## 1951 145 150 178 163 172 178 199 199 184 162 146 166
## 1952 171 180 193 181 183 218 230 242 209 191 172 194
## 1953 196 196 236 235 229 243 264 272 237 211 180 201
## 1954 204 188 235 227 234 264 302 293 259 229 203 229
## 1955 242 233 267 269 270 315 364 347 312 274 237 278
## 1956 284 277 317 313 318 374 413 405 355 306 271 306
## 1957 315 301 356 348 355 422 465 467 404 347 305 336
## 1958 340 318 362 348 363 435 491 505 404 359 310 337
## 1959 360 342 406 396 420 472 548 559 463 407 362 405
## 1960 417 391 419 461 472 535 622 606 508 461 390 432
plot(AirPassengers)
abline(reg = lm(AirPassengers ~ time(AirPassengers)))

#convert to df for rolling operations
ap = matrix(AirPassengers, nco = 1)
t1 = as.integer(min(time(AirPassengers)))
t2 = as.integer(max(time(AirPassengers)))
y = sort(rep(seq(from = t1, to = t2, by = 1), t2-t1 + 1))
m = rep(seq(from = 1, to = 12, by = 1), t2-t1 + 1)
apdf = tibble(year = y, month = m, passengers = ap[,1])
apdf = apdf %>% mutate(date = as.Date(paste0(year,'-',month,'-01'))) %>%
  select(date,passengers)
#remove last 3 years of data
apdf_3 = apdf %>% mutate(x = 1:nrow(apdf)) %>% filter(date < '1958-01-01')
ts_fit = lm(passengers ~ x, data = apdf_3)
ts_fit
## 
## Call:
## lm(formula = passengers ~ x, data = apdf_3)
## 
## Coefficients:
## (Intercept)            x  
##      95.042        2.493
intercept = ts_fit$coefficients[1]
slope = ts_fit$coefficients[2]

apdf_prediction = apdf_3 %>% mutate(type = 'historical') %>%
  bind_rows(apdf %>% filter(date >= '1958-01-01') %>% mutate(type='actual')) %>%
  mutate(x = 1:nrow(apdf)) %>%
  mutate(prediction = (x*slope) + intercept)

ggplot(apdf_prediction, aes(x=date, y=passengers, col=type)) + 
  geom_line() + 
  geom_line(aes(y=prediction), col='black')

rolling_mean = rollify(mean, window = 8)
tmp = apdf %>% mutate(ma = rolling_mean(passengers)) %>% drop_na()
ggplot(tmp, aes(x=date)) + geom_line(aes(y=passengers)) + geom_line(aes(y=ma), col = 'red')

Decomposition Example

Multiplicative model

Forecast = Trend * Seasonality * Cycle * Error

Decomposition Google Sheet

Do transformations!

plot(log(AirPassengers))

plot(diff(AirPassengers))

plot(diff(log(AirPassengers)))

adf.test(diff(log(AirPassengers)))
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(log(AirPassengers))
## Dickey-Fuller = -6.4313, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
acf(log(AirPassengers))

ACF is good for identifying MA component

acf(diff(log(AirPassengers)))

PACF is good for identifying AR component

pacf(diff(log(AirPassengers)))


The ARIMA Model

Function of (p,d,q)

AR – p – number of lags

I – d – number of differences

MA – q – error of the model as a combination of previous error terms

Non-Seasonal ARIMA Model is built: ARIMA

fit <- arima(log(AirPassengers), 
             order = c(0, 1, 1))
pred <- predict(fit, n.ahead = 10*12)
ts.plot(AirPassengers,2.718^pred$pred, log = "y", lty = c(1,3))

fit <- arima(log(AirPassengers), 
             order = c(0, 1, 1),
             seasonal = list(order = c(0, 1, 1), 
                             period = 12))

pred <- predict(fit, n.ahead = 10*12)
ts.plot(AirPassengers,2.718^pred$pred, log = "y", lty = c(1,3))

dow = tq_index("DOW") %>%
  tq_get(get = "stock.prices") %>%
  drop_na()
head(dow, 10)
p = ggplot(dow, aes(x = date, y = close, col = symbol)) + 
  geom_line() + 
  gghighlight(max(close) > 250) + 
  theme_minimal() + 
  facet_wrap(~ symbol)
print(p)

Is BA a stationary time series?

Can you simply zoom in and find a portion that is stationary?

ba = dow %>% 
  filter(symbol == 'BA') %>% 
  select(date, close)
plotly::ggplotly(ggplot(ba, aes(x = date, y = close)) + geom_line() + geom_smooth(method = lm, se = FALSE))
adf.test(zoo::zoo(ba$close, ba$date))
## 
##  Augmented Dickey-Fuller Test
## 
## data:  zoo::zoo(ba$close, ba$date)
## Dickey-Fuller = 0.43376, Lag order = 13, p-value = 0.99
## alternative hypothesis: stationary
ts_fit = lm(close ~ date, data = ba)
ts_fit
## 
## Call:
## lm(formula = close ~ date, data = ba)
## 
## Coefficients:
## (Intercept)         date  
##  -737.12864      0.05401
res = tibble(residuals = resid(ts_fit))
ggplot(res, aes(x = 1:nrow(res), y = residuals)) + geom_point() + geom_rug(sides = 'l')

ggplot(res, aes(x = residuals)) + geom_histogram(aes(y = ..density..), bins = 40) + geom_density(col = 'red', size = 1.5)

Are the daily returns stationary?

ret = ba %>%
  tq_transmute(close, mutate_fun = dailyReturn)
ggplot(ret, aes(x = date, y = daily.returns)) + geom_line() + geom_smooth(method = lm, se = FALSE)

adf.test(zoo::zoo(ret$daily.returns, ret$date))
## 
##  Augmented Dickey-Fuller Test
## 
## data:  zoo::zoo(ret$daily.returns, ret$date)
## Dickey-Fuller = -14.308, Lag order = 13, p-value = 0.01
## alternative hypothesis: stationary
log_close = ba %>% 
  mutate(log_close = log(close)) %>%
  mutate(daysFromStart = 1:nrow(ba))
lc = log_close %>%
  select(daysFromStart, close, log_close) %>%
  gather(price_type, close, -daysFromStart)

ggplot(lc, 
       aes(daysFromStart, close)) + 
  geom_point() + 
  # Here comes the gganimate code
  transition_states(
    price_type,
    transition_length = 2,
    state_length = 1
  ) +
  view_follow()

Differenced log?

log_close = ba %>% 
  mutate(close = close / 100) %>% # divide by 100 for animation purposes
  mutate(log_close = log(close),
         log_close_diff = 10 * (log_close - lag(log_close))) %>% # multiply by 10 for animation purposes
  mutate(daysFromStart = 1:nrow(ba))

lc = log_close %>%
  select(daysFromStart, close, log_close, log_close_diff) %>%
  gather(price_type, close, -daysFromStart) %>%
  drop_na()

ggplot(lc, 
       aes(daysFromStart, close)) + 
  geom_point(col = 'darkgreen', alpha = 0.5) + 
  # Here comes the gganimate code
  transition_states(
    price_type,
    transition_length = 2,
    state_length = 1
  )

ggplot(lc, 
       aes(close)) + 
  geom_density(fill = 'darkgreen', alpha = 0.25) + 
  # Here comes the gganimate code
  transition_states(
    price_type,
    transition_length = 2,
    state_length = 1
  )

lc = ba %>% 
  mutate(close = close) %>% # divide by 100 for animation purposes
  mutate(log_close = log(close),
         log_close_diff = log_close - lag(log_close)) %>%
  drop_na()

ggplot(lc, aes(x = log_close_diff)) + geom_histogram(bins = 40)